/* 
    Purpose: Using the 1950 Census, this file locates black and white women aged 
             30-50 who are mothers of a child younger than 18 in the same household. The 
             sample is further restricted to household heads who are sample-line persons 
             with positive personal income. Other variables necessary to create average 
             predicted mother income (in 2d) are also cleaned.

    Notes: Unable to use family income in 1950 because only sample line persons aged 14+ 
           are asked to report family income. Will use personal income instead.

    Creates: Census1950_mothers_ages30to50.dta
*/

* Start from an empty session, move to the Census data folder, load the raw
* 1950 extract, and stash an untouched copy so it can be reloaded after the
* children-only subset has been built.
clear
set more off
cd "$Mydirectory1/1_DataSources/CensusData/"

    * Raw extract downloaded from IPUMS USA
    use "./input/Census1950_1pct_raw.dta", clear
        tabulate perwt

    * Temporary copy of the full file
    tempfile fulldata
    save `fulldata'

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** IDENTIFY MOTHERS
***************************
/* Note: Must locate mothers before restricting the sample to sample line 
         respondents, who are unlikely to be <18. In fact, only ~7% of 1950 
         respondents are under 18 and are designated sample persons. Dropping 
         non-sample line people first will drop many respondents <18,
         and thus a lot of Census mothers will not be counted.
*/ 

    * Build a list with one row per mother (household serial + her person number)
    * from the records of children younger than 18.
    keep if age<18
    keep serial momloc

    * momloc of 0 or missing: child has no mother in the house, so exclude
    drop if momloc==0 | momloc==.

    * A mother with several children appears several times; keep one row each
    sort serial momloc
    by serial momloc: drop if _n>1

    * Rename the pointer so it lines up with the mother's own pernum in the merge
    rename momloc pernum

    tempfile children
    save `children'

    
* Reload the full file and retain only the persons who appear in the list of mothers
    use `fulldata', clear
    merge 1:1 serial pernum using `children', keep(match) nogenerate

** Of those mothers, retain the ones with relate==1 (heads of household)
    tabulate relate
    keep if relate==1

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** SET UP INCOME VARIABLES
***************************

* Sample-line persons only
    keep if slrec==2

/* Note: No need to adjust income using the 
         CPI, as Jácome et al put income in
         1950 dollars */

* Keep only people with positive, non-missing total personal income.
* 9999999 is treated as missing, and negative incomes are treated as zero, 
* so both are removed together with true zeros and missing values.
    drop if inctot==9999999 | inctot==. | inctot<=0
    
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** CREATE OTHER NECESSARY VARIABLES
***************************
  
* Black and white mothers between 30 and 50 years old (inclusive)
    keep if inrange(age,30,50) & inlist(race,1,2)

* State of residence, stored under the name fips from here on
    tabulate statefip, missing
    rename statefip fips
 
* Region of current residence   
    * Collapses the detailed region codes into four broad regions (region_merge) 
    * and builds a South indicator (south_merge).
    * Northeast: Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont, New Jersey, New York, Pennsylvania
    * Midwest: Illinois, Indiana, Michigan, Ohio, Wisconsin, Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota, South Dakota
    /* South: Delaware, District of Columbia, Florida, Georgia, Maryland, North Carolina, South Carolina, Virginia, West Virginia, Alabama,
              Kentucky, Mississippi, Tennessee, Arkansas, Louisiana, Oklahoma, Texas */
    * West: Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah, Wyoming, California, Oregon, Washington --note: Census puts AK and HI in with Pacific division
   
    * Any region code not listed below leaves region_merge missing
    gen region_merge =.
    replace region_merge =1 if (region==11 | region==12) //Northeast
    replace region_merge =2 if (region==21 | region==22) //Midwest
    replace region_merge =3 if inrange(region,31,33) //South
    replace region_merge =4 if (region==41 | region==42) //West
    tab region, m
    tab region_merge, m
    
    label define region_l 1 "NORTHEAST" 2 "MIDWEST" 3 "SOUTH" 4 "WEST"
    label values region_merge region_l
    tab region_merge, m
    
    * South indicator. A bare logical expression evaluates to 0 when region_merge 
    * is missing, which would silently classify unassigned regions as non-South; 
    * the if-qualifier keeps those observations missing instead.
    gen south_merge = region_merge==3 if !missing(region_merge)
    tab south_merge, m   

* Education variable 
    * Groups detailed educational attainment (educd) into five ordered categories.
    * Observations that match none of the conditions below keep edu missing.
    gen edu=.
    replace edu=1 if educd<=25 //<grade school (includes people with no schooling)
    replace edu=2 if educd==26 //8th grade
    replace edu=3 if inlist(educd,30,40,50,61) //less than hs
    replace edu=4 if inlist(educd,60,62,63,64) //hs
    * The upper bound is checked on educd itself so that the 999 missing code 
    * (and Stata missing values, which sort above every number) stay unclassified 
    * rather than being counted as more than high school.
    replace edu=5 if educd>64 & educd<999 //>hs. "999" = missing
    tab edu, m

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

****************
**** ASSIGN COARSENED OCCS
****************

* Prepare the occupation code: values of 980 and above are treated as missing
    sort occ1950
    replace occ1950 = . if occ1950>=980

* Flag the first record of every occupation code and count the flags to see 
* how many distinct codes are present in the data
    by occ1950, sort: gen nvals = (_n==1)
    count if nvals==1 

/* Note: Per Census documentation, OCCSCORE is a constructed 
         2-digit numeric variable that assigns occupational 
         income scores to each occupation. OCCSCORE represents 
         the median total income (in hundreds of 1950 dollars) 
         of all persons with that particular occupation in 1950. 
         It is rescaled here from hundreds of dollars to dollars.
*/
    replace occscore = 100*occscore

* Occupations coded 200-290 are split by self-employment: records with 
* classwkr==1 are moved to a separate code, 1000 above the original
    replace occ1950 = occ1950+1000 if inrange(occ1950,200,290) & classwkr==1
    
* Attach the coarsened ANES occupation from the crosswalk. Crosswalk rows with 
* no Census counterpart are not kept, and every Census record must find a match.
    merge m:1 occ1950 using ../Crosswalks/Crosswalk_1950Census_toANES.dta, keep(master match)
    assert _merge==3
    drop _merge
    
* A missing coarsened occupation is only allowed when the Census occupation 
* itself is missing; those records are then removed
    assert occ1950==. if occ1950ej==.
    drop if occ1950ej==. 
 
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

****************
**** SAVE
****************

* Final variable list; the coarsened occupation is stored as the mother's occupation
    rename occ1950ej motheroccej
    keep race south_merge motheroccej inctot occscore perwt slwt 
 
* Marker identifying these observations as coming from the Census
    gen census = 1
    label variable census "Census obs"
    
* Log of the mother's total personal income, plus labels for both income variables
    gen log_mother_inctot = ln(inctot)
    label variable log_mother_inctot "Log inctot, Census"
    label variable inctot "inctot, Census"
  
* Shrink storage types and write the analysis file
    compress 
    save ./output/Census1950_mothers_ages30to50.dta, replace 
